Specify parameters

# Specify the parameters required for the analysis
# This dir should have the motrpac-bic-norm-qc repo
repo_local_dir = "~/Desktop/repos/"
# Runnable command for gsutil
gsutil_cmd = "~/google-cloud-sdk/bin/gsutil"
# Where should the data be downloaded to
local_data_dir = "~/Desktop/MoTrPAC/data/pass_1b/rrbs/"
# Bucket structure is: tissue/platform/results/<data files>. For GET, this path should
# also have a qa_qc directory with the qc_metrics and sample_metadata files.
bucket = "gs://motrpac-release-data-staging/Results/pass1b-06/epigenomics/"
# Specify bucket and local path for the phenotypic data
# Specify bucket and local path for the phenotypic data
# this path is internal to BIC - faster
# pheno_bucket = "gs://bic_data_analysis/pass1b/pheno_dmaqc/merged2020-06-12/"
pheno_bucket = "gs://motrpac-internal-release2-results/pass1b-06/phenotype/"
pheno_local_dir = "~/Desktop/MoTrPAC/data/pass_1b/dmaqc_pheno/"

# we require a site to have a total count
# (both methylated and unmethylated) of at least min_count_coverage in 
# at least min_per_samples * (#samples); 
# specifying min_per_samples=1 means all samples
min_count_coverage = 10
min_per_samples = 1
# parameters for defining promoters: 
# define region sizes around TSS per gene
map_to_promoters=T
promoter_coord = c(-1000,2000)
# Specify how many PCs to examine
num_pcs = 5
num_pcs_for_outlier_analysis = 3
# Specify the significance level for association analysis
# (e.g., between a PC and a clinical variable)
p_thr = 0.0001
# Define technical and biological variables to be considered in the QC
pipeline_qc_cols = c("reads","pct_Unaligned","pct_GC","pct_chrM")
biospec_cols = c("registration.sex","key.anirandgroup",
                 "registration.batchnumber",
                 "training.staffid",
                  "is_control",
                  "vo2.max.test.vo2_max_visit1", # this assumes that visit1's are aligned
                  "terminal.weight.mg","time_to_freeze",
                  "timepoint","bid","pid")
# load functions and libraries
source(paste0(repo_local_dir,"motrpac-bic-norm-qc/tools/unsupervised_normalization_functions.R"))
source(paste0(repo_local_dir,"motrpac-bic-norm-qc/tools/gcp_functions.R"))
source(paste0(repo_local_dir,"motrpac-bic-norm-qc/tools/qc_report_aux_functions.R"))
source(paste0(repo_local_dir,"motrpac-bic-norm-qc/tools/config_session.R"))
source(paste0(repo_local_dir,"motrpac-bic-norm-qc/tools/association_analysis_methods.R"))

Get the Rat annotations:

# manage the gene annotation
# # Install the rat db
# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# BiocManager::install("org.Rn.eg.db")
library("org.Rn.eg.db")
## Loading required package: AnnotationDbi
## 

Load the RRBS data from the bucket and filter low counts

This flow is based on: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5747346/ Note that RRBS data are assumed to have two columns per sample, one with the “-Me” suffix and the other with the “-Un” suffix.

First, download the data to local dir (skip this if already downloaded, these are large data tables).

obj = download_bucket_to_local_dir(bucket,local_path=local_data_dir,
             remove_prev_files = T,GSUTIL_PATH=gsutil_cmd)
# delete bismark files
bis_files = obj$downloaded_files[
  grepl("bismark.cov.gz",obj$downloaded_files)
]
rem_obj = sapply(bis_files,file.remove)

Now load and filter the RRBS data:

obj = list(
  downloaded_files = list.files(local_data_dir,full.names = T,recursive = T)
)
obj$downloaded_files = obj$downloaded_files[
  grepl("epigen-rrbs",obj$downloaded_files)
]
rdata_files = obj$downloaded_files[
  grepl(".RData$",ignore.case = T,obj$downloaded_files)
]
names(rdata_files) = sapply(rdata_files,GET_get_tissue_from_download_path)
tissues = names(rdata_files)
tissue2rrbs_data = list()
for (tissue in tissues){
  curr_rdata_file = rdata_files[grepl(tissue,rdata_files)]
  if(length(curr_rdata_file)!=1){
    print(paste("Warning: more than a single RData file in tissue:",tissue))
  }
  yall = get(load(curr_rdata_file))
  
  # Filtering unassembled chromosomes
  keep <- rep(TRUE, nrow(yall))
  Chr <- as.character(yall$genes$Chr)
  keep[ grep("random",Chr) ] <- FALSE
  keep[ grep("chrUn",Chr) ] <- FALSE
  # remove non-chr ones
  keep[!grepl("chr",Chr) ] <- FALSE
  # remove M chromosome (otherwise we get error when assigning annotation below)
  keep[Chr=="chrM"] <- FALSE
  print(paste("Data from tissue",tissue,"was loaded into session"))
  print(paste("how many sites were removed (unassembled+chrM)?",sum(!keep)))
  # keep.lib.sizes=FALSE causes the library sizes to be recomputed
  yall <- yall[keep,, keep.lib.sizes=FALSE]
  # fix the levels and order by chromosome
  # rat genome chromosomes:
  ChrNames <- paste0("chr",c(1:20,"X","Y"))
  yall$genes$Chr <- factor(yall$genes$Chr, levels=ChrNames)
  o <- order(yall$genes$Chr, yall$genes$Locus)
  yall <- yall[o,]
  print(paste("Counts matrix dim before low counts filter:"))
  print(dim(yall))
  gc()
  
  # get locations, gene ids, etc.
  TSS <- nearestTSS(yall$genes$Chr, yall$genes$Locus, species="Rn")
  yall$genes$EntrezID <- TSS$gene_id
  yall$genes$Symbol <- TSS$symbol
  yall$genes$Strand <- TSS$strand
  yall$genes$Distance <- TSS$distance
  yall$genes$Width <- TSS$width
  head(yall$genes)
  
  if(map_to_promoters){
      # Map data to promoters and sum over regions
      InPromoter <- yall$genes$Distance >= promoter_coord[1] & yall$genes$Distance <= promoter_coord[2]
      # We subset the CpGs to those contained in a promoter region:
      yIP <- yall[InPromoter,,keep.lib.sizes=FALSE]
      # Compute total counts
      ypr <- rowsum(yIP, yIP$genes$EntrezID, reorder=FALSE)
      yall = ypr
      gc()
  }
  
  # Remove low counts (by promoter)
  # The analysis needs to be restricted to CpG sites that have enough coverage for the
  # methylation level to be measurable in a meaningful way at that site. 
  # As a conservative rule of thumb, we require a site to have a total count
  # (both methylated and unmethylated) of at least 8 in every sample.
  Coverage <- yall$counts[,grepl("-Me",colnames(yall$counts))] + 
    yall$counts[, grepl("-Un",colnames(yall$counts))]
  gc()
  # see description of min_count_coverage and min_per_samples above
  keep <- rowSums(Coverage >= min_count_coverage) >= min_per_samples*ncol(Coverage)
  # filter the data
  y <- yall[keep,, keep.lib.sizes=FALSE]
  print(paste("Counts matrix dim after low counts filter:"))
  print(dim(y))
  
  # Data normalization
  # A key difference between BS-seq and other sequencing data is that the pair of libraries 
  # holding the methylated and unmethylated reads for a particular sample are treated as a unit.
  # To ensure that the methylated and unmethylated reads for the same sample are treated on the
  # same scale, we need to set the library sizes to be equal for each pair of libraries. 
  # We set the library sizes for each sample to be the average of the total read counts for 
  # the methylated and unmethylated libraries.
  # Other normalization methods developed for RNA-seq data, such as TMM, are not required for BS-seq data.
  TotalLibSize <- y$samples$lib.size[grepl("-Me",colnames(yall$counts))] +
    +       y$samples$lib.size[grepl("-Un",colnames(yall$counts))]
  y$samples$lib.size <- rep(TotalLibSize, each=2)
  
  tissue2rrbs_data[[tissue]] = y
  
  rm(yall);rm(y);gc()
}
## [1] "Data from tissue t55-gastrocnemius was loaded into session"
## [1] "how many sites were removed (unassembled+chrM)? 74600"
## [1] "Counts matrix dim before low counts filter:"
## [1] 6869469     104
## [1] "Counts matrix dim after low counts filter:"
## [1] 12333   104
## [1] "Data from tissue t58-heart was loaded into session"
## [1] "how many sites were removed (unassembled+chrM)? 72607"
## [1] "Counts matrix dim before low counts filter:"
## [1] 6597238     104
## [1] "Counts matrix dim after low counts filter:"
## [1] 12324   104
## [1] "Data from tissue t59-kidney was loaded into session"
## [1] "how many sites were removed (unassembled+chrM)? 73240"
## [1] "Counts matrix dim before low counts filter:"
## [1] 6638813     104
## [1] "Counts matrix dim after low counts filter:"
## [1] 12259   104
## [1] "Data from tissue t68-liver was loaded into session"
## [1] "how many sites were removed (unassembled+chrM)? 71495"
## [1] "Counts matrix dim before low counts filter:"
## [1] 6499281     104
## [1] "Counts matrix dim after low counts filter:"
## [1] 12239   104
## [1] "Data from tissue t69-brown-adipose was loaded into session"
## [1] "how many sites were removed (unassembled+chrM)? 78814"
## [1] "Counts matrix dim before low counts filter:"
## [1] 7385104     104
## [1] "Counts matrix dim after low counts filter:"
## [1] 12382   104
## [1] "Data from tissue t70-white-adipose was loaded into session"
## [1] "how many sites were removed (unassembled+chrM)? 74742"
## [1] "Counts matrix dim before low counts filter:"
## [1] 6962440     104
## [1] "Counts matrix dim after low counts filter:"
## [1] 12294   104
# Get the pipeline qc scores
pipeline_scores_file = obj$downloaded_files[
    grepl("qa-qc-metrics",obj$downloaded_files) & 
      grepl(".csv$",obj$downloaded_files)
  ]
if(length(pipeline_scores_file)!=1){
  print("Error: RRBS data does not have a single pipeline scores file")
}
pipelineqc_scores = read.csv(pipeline_scores_file)
rownames(pipelineqc_scores) = as.character(pipelineqc_scores$vial_label)

Add the phenotypic data

pheno_data = parse_pheno_data(pheno_bucket,local_path = pheno_local_dir,
             remove_prev_files = T,GSUTIL_PATH=gsutil_cmd)
# add a tissue variable (for convinience)
pheno_data$viallabel_data$tissue = 
  pheno_data$viallabel_data$specimen.processing.sampletypedescription
# add the time to freeze variable ((for convinience))
pheno_data$viallabel_data$time_to_freeze = 
  pheno_data$viallabel_data$calculated.variables.frozetime_after_train - 
  pheno_data$viallabel_data$calculated.variables.deathtime_after_train
# add a binary is_control variable
pheno_data$viallabel_data$is_control = as.numeric(grepl("control",
  pheno_data$viallabel_data$key.anirandgroup,ignore.case = T))
# add the timepoint as a number
# x - the text description of the group
get_numeric_timepoint<-function(x){
  v = rep(0,length(x))
  tps = c("Eight"=8,"Four"=4)
  for(tp in names(tps)){
    v[grepl(paste0(tp,"-week"),x,perl = T,ignore.case = T)] = tps[tp]
  }
  return(v)
}
pheno_data$viallabel_data$timepoint = get_numeric_timepoint(
  pheno_data$viallabel_data$key.anirandgroup
)

Dataset preprocessing

We transform the normalized data into M values and quantile normalize the resulting matrix. These can be interpreted as the base2 logit transformation of the proportion of methylated signal at each locus.

tissue2Mmatrix = list()
for(tissue in names(tissue2rrbs_data)){
  y = tissue2rrbs_data[[tissue]]
  Me <- y$counts[, grepl("-Me",colnames(y$counts))]
  Un <- y$counts[, grepl("-Un",colnames(y$counts))]
  # The M-value can be interpreted as the base2 logit transformation of the
  # proportion of methylated signal at each locus:
  M <- log2(Me + 2) - log2(Un + 2)
  colnames(M) <- gsub("-Me","",colnames(M))
  # rownames(M) == y$genes$EntrezID # sanity check
  # # MDS plot
  # # In this plot the distance between each pair of samples represents the average logit 
  # # change between the samples for the top most differentially methylated
  # # CpG loci between that pair of samples.
  # sex = sample_covs[colnames(M),1]
  # sex[sex==1] = "M";sex[sex==2] = "F"
  # cols = rep("blue",length(sex))
  # cols[sex=="M"] = "green"
  # # plotMDS(M,labels = sex,pch = sample_covs[colnames(M),1]+20,col=cols,main = tissue)
  samp = sample(1:ncol(M))[1:20]
  samp2 = sample(1:nrow(M))[1:min(20000,nrow(M))]
  boxplot(M[samp2,samp],
          main=paste0(tissue,"\nbefore quantile (",nrow(M)," promoters)"),
          ylab = "log2 M values",xaxt="n")
  M = run_quantile_normalization(M)
  boxplot(M[samp2,samp],
          main=paste0(tissue,"\nafter quantile (",nrow(M)," promoters)"),
          ylab = "log2 M values",xaxt="n")
  tissue2Mmatrix[[tissue]] = M
}

MOP-flagged samples

The RRBS pipeline manual of procedures (MOP) defines a flagged sample (e.g., potentially problematic) as a sample that satisfies one of the following: (1) number of read pairs <20M, (2) Low number of uniquely mapped reads: pct_Uniq <50%, (3) mapped read count (pct_Uniq*reads) < 50% of the median mapped read count of all samples within a tissue (within a sequencing batch), (4) bisulfite conversion efficiency less than 95%, i.e lambda_pct_CpG>5%, but limited to samples with lambda_pct_Uniq>0.5%, (5)unexpected strands mapped: pct_OT<30% or pct_OB>70%; pct_CTOT>10% or pct_CTOB>10%, and (6) high duplications based on UMI: pct_umi_dup >20%.

pipelineqc_scores$pipeline_flags =  rrbs_pipeline_flagged_samples(pipelineqc_scores)
mop_flagged_samples = rownames(pipelineqc_scores)[nchar(pipelineqc_scores$pipeline_flags)>0]

PCA visualization (sex and group)

We plot the top two PCs for each tissue, color and shape correspond to randomization group and sex.

tissue2pca = list()
for(tissue in names(tissue2Mmatrix)){
  curr_data = tissue2Mmatrix[[tissue]]
  # remove vial labels that start with 8 (reference samples)
  curr_data = curr_data[,grepl("^9",colnames(curr_data),perl = T)]
  # remove zero variance rows
  curr_data = curr_data[apply(curr_data,1,sd)>0,]
    
  curr_pca = prcomp(t(curr_data),scale. = T,center = T,rank. = num_pcs)
  curr_pcax = curr_pca$x[,1:num_pcs]
  explained_var = summary(curr_pca)[["importance"]][3,5]

  # plot
  df = data.frame(curr_pcax,
    randgroup = pheno_data$viallabel_data[rownames(curr_pcax),"key.anirandgroup"],
    sex = as.character(pheno_data$viallabel_data[rownames(curr_pcax),"registration.sex"]),
    stringsAsFactors = F
  )
  df$sex[df$sex=="1"] = "F"
  df$sex[df$sex=="2"] = "M"
  p = ggplot(df) + 
      geom_point(aes(x=PC1, y=PC2,col=randgroup, shape=sex)) +
      ggtitle(tissue)
  plot(p)
  tissue2pca[[tissue]] = df
}

PC-based association analysis

Here we analyze the top principal components (5) in each tissue and compute their association with the selected variables above (e.g., the pipeline qc metrics). We use Spearman correlation and a linear test for significance.

For each dataset we also add the correlations among the metadata variables. These two analyses should be interpreted together, as the analyzed variables are not independent.

pcs_vs_qc_var_report = c()
metadata_variable_assoc_report = c()
for(currname in names(tissue2Mmatrix)){
  curr_pcax = tissue2pca[[currname]]
  curr_pcax = curr_pcax[,grepl("^PC",colnames(curr_pcax))]
  # get the metadata of the samples
  curr_meta = cbind(pipelineqc_scores[rownames(curr_pcax),pipeline_qc_cols],
                     pheno_data$viallabel_data[rownames(curr_pcax),biospec_cols])
  # remove metadata variables with NAs
  curr_meta = curr_meta[,!apply(is.na(curr_meta),2,any)]
  # remove bid and pid
  curr_meta = curr_meta[,!grepl("pid|bid",names(curr_meta))]
  # remove the text group description
  curr_meta = curr_meta[!grepl("randgr",names(curr_meta))]
  # remove fields withe zero variance
  curr_meta = curr_meta[,apply(curr_meta,2,sd)>0]
  
  # Assumption here: all metadata values are numeric
  corrs = cor(curr_pcax,curr_meta,method="spearman")
  corrsp = pairwise_eval(
    curr_pcax[rownames(curr_meta),],
    curr_meta,func=pairwise_association_wrapper,
    f=1)
  
  # Some ggplots have to be printed to be shown in the notebook
  print(ggcorrplot(t(corrs),lab=T,lab_size=2.5,hc.order = F) +
    ggtitle(currname) +
    theme(plot.title = element_text(hjust = 0.5,size=20)))
  
  for(i in 1:nrow(corrsp)){
    for(j in 1:ncol(corrsp)){
      if(corrsp[i,j]>p_thr){next}
      pcs_vs_qc_var_report = rbind(pcs_vs_qc_var_report,
            c(currname,
              rownames(corrsp)[i],colnames(corrsp)[j],corrs[i,j],corrsp[i,j])
            )
    }
  }
  colnames(pcs_vs_qc_var_report) = c("Dataset(tissue,site)","PC",
                                  "qc_metric","rho(spearman)","p-value")
  
  ####
  # compute correlations among the metadata variables
  ####
  corrs = cor(curr_meta,method="spearman")
  corrsp = pairwise_eval(
    curr_meta,func=pairwise_association_wrapper,
    f=1)
  
  # Some ggplots have to be printed to be shown in the notebook
  print(ggcorrplot(corrs,lab=T,lab_size=1.5,hc.order = T))
  
  for(n1 in rownames(corrsp)){
    for(n2 in rownames(corrsp)){
      if(n1==n2){break}
      if(n1 %in% biospec_cols &&
         n2 %in% biospec_cols) {next}
      if(corrsp[n1,n2]>p_thr){next}
      metadata_variable_assoc_report = rbind(metadata_variable_assoc_report,
            c(currname,n1,n2,corrs[n1,n2],corrsp[n1,n2])
            )
    }
  }
  if(!is.null(dim(metadata_variable_assoc_report))){
    colnames(metadata_variable_assoc_report) = c(
      "Dataset(tissue,site)","Var1","Var2","rho(spearman)","p-value")
  }
}

# Format the reports - for a nicer presentation in a table
pcs_vs_qc_var_report[,5] = format(
  as.numeric(pcs_vs_qc_var_report[,5]),digits=3)
pcs_vs_qc_var_report[,4] = format(
  as.numeric(pcs_vs_qc_var_report[,4]),digits=3)
metadata_variable_assoc_report[,5] = format(
  as.numeric(metadata_variable_assoc_report[,5]),digits=3)
metadata_variable_assoc_report[,4] = format(
  as.numeric(metadata_variable_assoc_report[,4]),digits=3)

PCA outliers

In this analysis, outliers are flagged by examining the boxplot of each PC, extending its whiskers to three times the inter-quantile range away from the boxplot. Samples outside this range are then flagged.

Note that samples are flagged using an automatic analysis of the principal components. Such analyses may flag samples because of true biological effects and thus further examination is required before determining if flagged samples represent low quality samples.

pca_outliers_report = c()
for(currname in names(tissue2pca)){
  curr_pcax = tissue2pca[[currname]]
  curr_pcax = curr_pcax[,grepl("^PC",colnames(curr_pcax))]
  
  # Univariate: use IQRs
  pca_outliers = c()
  for(j in 1:ncol(curr_pcax)){
    outlier_values <- boxplot.stats(curr_pcax[,j],coef = 3)$out
    for(outlier in names(outlier_values)){
      pca_outliers_report = rbind(pca_outliers_report,
       c(currname,paste("PC",j,sep=""),outlier,
         format(outlier_values[outlier],digits=5))
      )
      if(!is.element(outlier,names(pca_outliers))){
        pca_outliers[outlier] = outlier_values[outlier]
      }
    }
  }
  
  # Plot the outliers
  if(length(pca_outliers)>0){
    # print(length(kNN_outliers))
    df = data.frame(curr_pcax,
                outliers = rownames(curr_pcax) %in% names(pca_outliers))
    col = rep("black",nrow(df))
    col[df$outliers] = "green"
    plot(df$PC1,df$PC2,pch = as.numeric(df$outliers),col=col,lwd=2,cex=1,
         main = paste(currname,"flagged outliers"),
         xlab = "PC1",ylab="PC2")
    plot(df$PC3,df$PC4,pch = as.numeric(df$outliers),col=col,lwd=2,cex=1,
         main = paste(currname,"flagged outliers"),
         xlab = "PC3",ylab="PC4")
  }
}
if(!is.null(dim(pca_outliers_report))){
  colnames(pca_outliers_report) =  c("dataset","PC","sample","score")
}

Sex checks

We verify that the sex of a sample can be predicted from the percent of chromosome Y and percent of chromosome X reads. This is done automatically by training a logistic regression classifier, but we also plot the 2D plot for each tissue.

# sex checks - using simple logistic regression classifier (LOO-CV)
sex_read_info = cbind(pipelineqc_scores$pct_chrX,pipelineqc_scores$pct_chrY)
rownames(sex_read_info) = rownames(pipelineqc_scores)
sex_check_outliers = c()
for(currname in names(tissue2Mmatrix)){
  # 1 is female
  pca_df = tissue2pca[[currname]]
  read_info = sex_read_info[rownames(pca_df),]
  df = data.frame(sex=as.factor(pca_df$sex),
                  pct_chrX = read_info[,1],
                  pct_chrY = read_info[,2],stringsAsFactors = F)
  p = ggplot(df) + 
      geom_point(aes(x=pct_chrX, y=pct_chrY,col=sex, shape=sex)) +
      ggtitle(paste0(currname," - sex check"))
  plot(p)
  train_control <- trainControl(method = "cv", number = nrow(df),
                                savePredictions = TRUE)
  # train the model on training set
  model <- train(sex ~ .,data = df,
               trControl = train_control,
               method = "glm", family=binomial(link="logit"))
  # CV redults
  cv_res = model$pred
  cv_errors = cv_res[,1] != cv_res[,2]
  err_samples = rownames(df)[cv_res[cv_errors,"rowIndex"]]
  for(samp in err_samples){
    sex_check_outliers = rbind(sex_check_outliers,
                               c(currname,samp))
  }
}

if(! is.null(dim(sex_check_outliers))){
  colnames(sex_check_outliers) = c("dataset","sample")
}